import numpy as np
from scipy.sparse import coo_matrix
import sqlite3
# Module-level SQLite connection to the local database file, and the shared
# cursor that the queries in this script are executed through.
conn = sqlite3.connect("mydatabase.db")
cursor = conn.cursor()
def get_data(start=0, count=10000):
    """Load interaction pairs from the ``cross`` table as a sparse matrix.

    Reads up to ``count`` rows through the module-level ``cursor``, drops the
    first column of each row and uses the next two columns as (row, column)
    indices of a COO matrix whose stored values are the number of times each
    pair occurs.
    NOTE(review): the columns are presumably (id, playlist_id, song_id) in
    that order -- confirm against the table schema.

    Args:
        start: Currently unused; kept so existing callers keep working.
        count: Maximum number of rows to read, and the exclusive upper bound
            on ``id`` for the rows that are counted to size the matrix.

    Returns:
        scipy.sparse.coo_matrix with duplicate entries summed.

    Raises:
        ValueError: If the query returns no rows.
    """
    # Bound parameters keep the SQL text constant instead of splicing the
    # value into the statement.
    cursor.execute("SELECT * FROM cross LIMIT ?", (count,))
    fetched = cursor.fetchall()
    if not fetched:
        # An empty fetch would otherwise fail below with an opaque IndexError.
        raise ValueError("no rows returned from table 'cross'")

    # Drop the leading column; the remaining two are the matrix indices.
    pairs = np.array(fetched)[:, 1:]
    row = pairs[:, 0]
    col = pairs[:, 1]

    # One query for both counts (previously two separate statements).
    cursor.execute(
        "SELECT COUNT(playlist_id), COUNT(song_id) FROM cross WHERE id < ?",
        (count,),
    )
    num_row, num_col = cursor.fetchone()
    num_row += 1  # the original sizing always reserved one extra row

    # The counts above are numbers of rows, not largest index values, so an
    # index can exceed them and make coo_matrix reject the shape. Widen the
    # shape only when needed; otherwise it is unchanged.
    num_row = max(num_row, int(row.max()) + 1)
    num_col = max(num_col, int(col.max()) + 1)

    data = np.ones(pairs.shape[0])
    cross_tab = coo_matrix((data, (row, col)), shape=(num_row, num_col))
    # Collapse repeated (row, col) pairs into a single summed entry.
    cross_tab.sum_duplicates()
    return cross_tab
# lightfm's own splitter is used for the hold-out
# (sklearn.model_selection.train_test_split is not used).
from lightfm.cross_validation import random_train_test_split

# Build the interaction matrix with the default row limit, then hold out
# 30% of the interactions as the test set.
cross_tab = get_data()
train, test = random_train_test_split(cross_tab, test_percentage=0.3)
# Captured notebook output (warning emitted by lightfm):
# C:\Users\HeavyChevy\anaconda3\lib\site-packages\lightfm\_lightfm_fast.py:9: UserWarning: LightFM was compiled without OpenMP support. Only a single thread will be used. warnings.warn(
from lightfm import LightFM
from lightfm.evaluation import auc_score, precision_at_k

# WARP-loss model with 16 components, fitted on the training split for
# 15 epochs.
model = LightFM(no_components=16, loss='warp', learning_rate=0.05)
model.fit_partial(train, epochs=15)

# Mean precision@10 and mean AUC on both splits; the test-set metrics are
# also given the training matrix through train_interactions.
train_precision = precision_at_k(model, train, k=10).mean()
test_precision = precision_at_k(
    model, test, train_interactions=train, k=10
).mean()
train_auc = auc_score(model, train).mean()
test_auc = auc_score(model, test, train_interactions=train).mean()

precision_report = 'Precision: train %.2f, test %.2f.' % (train_precision, test_precision)
auc_report = 'AUC: train %.2f, test %.2f.' % (train_auc, test_auc)
print(precision_report)
print(auc_report)
# Captured notebook output:
# Precision: train 0.89, test 0.01.
# AUC: train 1.00, test 0.42.
# Pull the learned embedding matrices off the fitted model and write each
# one to its own .npy file.
item_embeddings = model.item_embeddings
user_embeddings = model.user_embeddings
for path, matrix in (
    ('item_embeddings.npy', item_embeddings),
    ('user_embeddings.npy', user_embeddings),
):
    with open(path, 'wb') as f:
        np.save(f, matrix)
# The 100 artist names with the most rows among songs with id < 100000.
cursor.execute("SELECT artist_name FROM songs WHERE id < 100000 GROUP BY artist_name ORDER BY COUNT(*) DESC LIMIT 100;")
artists = np.array(cursor.fetchall())

selected_songs = []  # fetched (id,) rows for every selected artist's songs
res_artist = []      # artist name for each entry of selected_songs
un_art = {}          # insertion-ordered set of the artist names seen
for art in artists:
    # Bind the name as a parameter: splicing it between single quotes broke
    # the statement for any artist whose name contains an apostrophe.
    cursor.execute(
        "SELECT id FROM songs WHERE (artist_name = ?) and (id < 10000)",
        (str(art[0]),),
    )
    tmp = cursor.fetchall()
    selected_songs.extend(tmp)
    res_artist.extend([art[0]] * len(tmp))
    un_art[art[0]] = 1
from sklearn.manifold import TSNE

# Load the item embedding matrix from disk and pick one row per selected
# song; each entry of selected_songs is used directly as the index.
emb = np.load('item_embeddings.npy')
res_pts = []
res_songs = [emb[song_row] for song_row in selected_songs]

# Project the selected embeddings down to two dimensions.
projector = TSNE(n_components=2)
X_embedded = projector.fit_transform(res_songs)
X_embedded.shape
# Captured notebook output (X_embedded.shape): (1473, 2)
# Persist the 2-D projection so it can be reused without re-running t-SNE.
with open('TSNE.npy', 'wb') as f:
    np.save(f, X_embedded)

import pandas as pd

# Columns 0 and 1 hold the projected coordinates; 'artist' labels each point.
df = pd.DataFrame(X_embedded)
df['artist'] = res_artist
# Assign the result back: an inplace replace on the `df['artist']` selection
# is a chained operation that does not reliably update `df` itself.
df['artist'] = df['artist'].replace('', 'Unknown')

import plotly.express as px

# Interactive scatter plot, one colour per artist.
fig = px.scatter(df, x=0, y=1, color="artist")
fig.show()
import matplotlib.pyplot as plt

# Position of each artist in un_art, built once instead of rebuilding and
# scanning a key list for every row.
artist_index = {name: idx for idx, name in enumerate(un_art)}
# Artists missing from un_art get -1, as before, but without a bare
# `except` that would also hide unrelated errors.
col = [artist_index.get(name, -1) for name in df['artist']]

# Static scatter plot coloured by artist index.
plt.scatter(df[0], df[1], c=col)
plt.show()
# Score every item column of the training matrix for a single user.
user_id = 1
n_users, n_items = train.shape
candidate_items = np.arange(n_items)
scores = model.predict(user_id, candidate_items)
scores
# Captured notebook output (scores):
# array([ 0.3914338 , 0.5463036 , 0.5903043 , ..., -0.4615834 ,
#        -0.4641547 , -0.50609154], dtype=float32)